R数据处理 · 柳溪的个人博客

R 数据处理

本节数据处理主要是针对data.frame数据结构，由最基本的方法到运用dplyr包，reshape包等对数据进行加工及运算。
下面通过具体实例学习R数据处理。

这是三家公司的营收与利润数额。如果大家想一步步跟随本文进行尝试，那么请将下列内容输入（或者直接复制加粘贴）到自己的R终端窗口当中：

fy <- c(2010,2011,2012,2010,2011,2012,2010,2011,2012);
company <- c("Apple","Apple","Apple","Google","Google","Google","Microsoft","Microsoft","Microsoft"); 
revenue <- c(65225,108249,156508,29321,37905,50175,62484,69943,73723);  
profit <- c(14013,25922,41733,8505,9737,10737,18760,23150,16978);
companiesData <- data.frame(fy, company, revenue, profit);
companiesData$fy <- as.factor(companiesData$fy);
companiesData;
str(companiesData)

	fy	company	revenue	profit
1	2010	Apple	65225	14013
2	2011	Apple	108249	25922
3	2012	Apple	156508	41733
4	2010	Google	29321	8505
5	2011	Google	37905	9737
6	2012	Google	50175	10737
7	2010	Microsoft	62484	18760
8	2011	Microsoft	69943	23150
9	2012	Microsoft	73723	16978

'data.frame':    9 obs. of  4 variables:
 $ fy     : Factor w/ 3 levels "2010","2011",..: 1 2 3 1 2 3 1 2 3
 $ company: Factor w/ 3 levels "Apple","Google",..: 1 1 1 2 2 2 3 3 3
 $ revenue: num  65225 108249 156508 29321 37905 ...
 $ profit : num  14013 25922 41733 8505 9737 ...

1 向现有数据框添加column

通过将利润除以营收再乘以100的方式添加一个“margin”（利润率）column：

1.1

companiesData$margin <- (companiesData$profit / companiesData$revenue)*100;
companiesData;

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.48409
2	2011	Apple	108249	25922	23.94664
3	2012	Apple	156508	41733	26.66509
4	2010	Google	29321	8505	29.00651
5	2011	Google	37905	9737	25.6879
6	2012	Google	50175	10737	21.3991
7	2010	Microsoft	62484	18760	30.02369
8	2011	Microsoft	69943	23150	33.09838
9	2012	Microsoft	73723	16978	23.02945

companiesData$margin <- round(companiesData$margin, 1);
companiesData;

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.5
2	2011	Apple	108249	25922	23.9
3	2012	Apple	156508	41733	26.7
4	2010	Google	29321	8505	29
5	2011	Google	37905	9737	25.7
6	2012	Google	50175	10737	21.4
7	2010	Microsoft	62484	18760	30
8	2011	Microsoft	69943	23150	33.1
9	2012	Microsoft	73723	16978	23

1.2 transform 函数

companiesData <- transform(companiesData, margin = (profit/revenue) * 100);
companiesData

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.48409
2	2011	Apple	108249	25922	23.94664
3	2012	Apple	156508	41733	26.66509
4	2010	Google	29321	8505	29.00651
5	2011	Google	37905	9737	25.6879
6	2012	Google	50175	10737	21.3991
7	2010	Microsoft	62484	18760	30.02369
8	2011	Microsoft	69943	23150	33.09838
9	2012	Microsoft	73723	16978	23.02945

companiesData <- transform(companiesData, margin = round((profit/revenue) * 100, 1));
companiesData

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.5
2	2011	Apple	108249	25922	23.9
3	2012	Apple	156508	41733	26.7
4	2010	Google	29321	8505	29
5	2011	Google	37905	9737	25.7
6	2012	Google	50175	10737	21.4
7	2010	Microsoft	62484	18760	30
8	2011	Microsoft	69943	23150	33.1
9	2012	Microsoft	73723	16978	23

1.3 apply函数

dataFrame$newColumn <- apply(dataFrame, 1, function(x) { . . . } );
1代表对row应用函数，2代表对列应用函数

companiesData$margin <- apply(companiesData[,c('revenue', 'profit')], 1, function(x) { (x[2]/x[1]) * 100 });
companiesData

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.48409
2	2011	Apple	108249	25922	23.94664
3	2012	Apple	156508	41733	26.66509
4	2010	Google	29321	8505	29.00651
5	2011	Google	37905	9737	25.6879
6	2012	Google	50175	10737	21.3991
7	2010	Microsoft	62484	18760	30.02369
8	2011	Microsoft	69943	23150	33.09838
9	2012	Microsoft	73723	16978	23.02945

companiesData$margin <- mapply(function(x, y) round((x/y) * 100, 1)companiesData$profit, companiesData$revenue);
companiesData

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.5
2	2011	Apple	108249	25922	23.9
3	2012	Apple	156508	41733	26.7
4	2010	Google	29321	8505	29
5	2011	Google	37905	9737	25.7
6	2012	Google	50175	10737	21.4
7	2010	Microsoft	62484	18760	30
8	2011	Microsoft	69943	23150	33.1
9	2012	Microsoft	73723	16978	23

highestMargin <- companiesData[companiesData$margin == max(companiesData$margin),];
highestMargin

	fy	company	revenue	profit	margin
8	2011	Microsoft	69943	23150	33.1

highestMargin <- subset(companiesData, margin==max(margin));
highestMargin

	fy	company	revenue	profit	margin
8	2011	Microsoft	69943	23150	33.1

library(plyr);
highestProfitMargins <- ddply(companiesData, 'company', summarize, bestMargin = max(margin));
highestProfitMargins

	company	bestMargin
1	Apple	26.7
2	Google	29
3	Microsoft	33.1

myResults <- ddply(companiesData, 'company', transform, highestMargin = max(margin), lowestMargin = min(margin));
myResults

	fy	company	revenue	profit	margin	highestMargin	lowestMargin
1	2010	Apple	65225	14013	21.5	26.7	21.5
2	2011	Apple	108249	25922	23.9	26.7	21.5
3	2012	Apple	156508	41733	26.7	26.7	21.5
4	2010	Google	29321	8505	29	29	21.4
5	2011	Google	37905	9737	25.7	29	21.4
6	2012	Google	50175	10737	21.4	29	21.4
7	2010	Microsoft	62484	18760	30	33.1	23
8	2011	Microsoft	69943	23150	33.1	33.1	23
9	2012	Microsoft	73723	16978	23	33.1	23

highestProfitMargins <- ddply(companiesData, 'company', function(x) x[x$margin==max(x$margin),]);
highestProfitMargins

	fy	company	revenue	profit	margin
1	2012	Apple	156508	41733	26.7
2	2010	Google	29321	8505	29
3	2011	Microsoft	69943	23150	33.1

companiesData[companiesDate\$margin==max(companiesData$margin),];

这部分代码本身并没有使用ddply()，因此得到的是最高全局利润率而非各公司最高利润率。不过由于匿名函数被加入到ddply()声明当中，而ddply()又已经按照公司名称对数据框进行了分割，所以返回的匹配行仍然符合按不同公司计算的要求。

要利用ddply()查看整个数据集中的最高利润率，而非根据公司category划分的子集，我们可以在第二个参数中输入NULL使分割factor变为无效：

highestProfitMargin <- ddply(companiesData, NULL, summarize, bestMargin = max(margin));
highestProfitMargin

	.id	bestMargin
1	NA	33.1

companiesOrdered <- companiesData[order(-companiesData$margin),];
companiesOrdered

	fy	company	revenue	profit	margin
8	2011	Microsoft	69943	23150	33.1
7	2010	Microsoft	62484	18760	30
4	2010	Google	29321	8505	29
3	2012	Apple	156508	41733	26.7
5	2011	Google	37905	9737	25.7
2	2011	Apple	108249	25922	23.9
9	2012	Microsoft	73723	16978	23
1	2010	Apple	65225	14013	21.5
6	2012	Google	50175	10737	21.4

companiesData[order(companiesData$fy, -companiesData$margin),];
companiesData

	fy	company	revenue	profit	margin
7	2010	Microsoft	62484	18760	30
4	2010	Google	29321	8505	29
1	2010	Apple	65225	14013	21.5
8	2011	Microsoft	69943	23150	33.1
5	2011	Google	37905	9737	25.7
2	2011	Apple	108249	25922	23.9
3	2012	Apple	156508	41733	26.7
9	2012	Microsoft	73723	16978	23
6	2012	Google	50175	10737	21.4

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.5
2	2011	Apple	108249	25922	23.9
3	2012	Apple	156508	41733	26.7
4	2010	Google	29321	8505	29
5	2011	Google	37905	9737	25.7
6	2012	Google	50175	10737	21.4
7	2010	Microsoft	62484	18760	30
8	2011	Microsoft	69943	23150	33.1
9	2012	Microsoft	73723	16978	23

library(reshape2);
companiesLong <- melt(companiesData, c("fy", "company"));
companiesLong

	fy	company	variable	value
1	2010	Apple	revenue	65225
2	2011	Apple	revenue	108249
3	2012	Apple	revenue	156508
4	2010	Google	revenue	29321
5	2011	Google	revenue	37905
6	2012	Google	revenue	50175
7	2010	Microsoft	revenue	62484
8	2011	Microsoft	revenue	69943
9	2012	Microsoft	revenue	73723
10	2010	Apple	profit	14013
11	2011	Apple	profit	25922
12	2012	Apple	profit	41733
13	2010	Google	profit	8505
14	2011	Google	profit	9737
15	2012	Google	profit	10737
16	2010	Microsoft	profit	18760
17	2011	Microsoft	profit	23150
18	2012	Microsoft	profit	16978
19	2010	Apple	margin	21.5
20	2011	Apple	margin	23.9
21	2012	Apple	margin	26.7
22	2010	Google	margin	29
23	2011	Google	margin	25.7
24	2012	Google	margin	21.4
25	2010	Microsoft	margin	30
26	2011	Microsoft	margin	33.1
27	2012	Microsoft	margin	23

companiesLong <- melt(companiesData, id.vars=c("fy", "company"),
                  measure.vars=c("revenue", "profit", "margin"),
                  variable.name="financialCategory", value.name="amount");
companiesLong

	fy	company	financialCategory	amount
1	2010	Apple	revenue	65225
2	2011	Apple	revenue	108249
3	2012	Apple	revenue	156508
4	2010	Google	revenue	29321
5	2011	Google	revenue	37905
6	2012	Google	revenue	50175
7	2010	Microsoft	revenue	62484
8	2011	Microsoft	revenue	69943
9	2012	Microsoft	revenue	73723
10	2010	Apple	profit	14013
11	2011	Apple	profit	25922
12	2012	Apple	profit	41733
13	2010	Google	profit	8505
14	2011	Google	profit	9737
15	2012	Google	profit	10737
16	2010	Microsoft	profit	18760
17	2011	Microsoft	profit	23150
18	2012	Microsoft	profit	16978
19	2010	Apple	margin	21.5
20	2011	Apple	margin	23.9
21	2012	Apple	margin	26.7
22	2010	Google	margin	29
23	2011	Google	margin	25.7
24	2012	Google	margin	21.4
25	2010	Microsoft	margin	30
26	2011	Microsoft	margin	33.1
27	2012	Microsoft	margin	23

companiesWide <- dcast(companiesLong, fy + company ~ financialCategory, value.var="amount");
companiesWide

	fy	company	revenue	profit	margin
1	2010	Apple	65225	14013	21.5
2	2010	Google	29321	8505	29
3	2010	Microsoft	62484	18760	30
4	2011	Apple	108249	25922	23.9
5	2011	Google	37905	9737	25.7
6	2011	Microsoft	69943	23150	33.1
7	2012	Apple	156508	41733	26.7
8	2012	Google	50175	10737	21.4
9	2012	Microsoft	73723	16978	23

dcast()会将long数据框的名称作为第一项参数。大家需要遵循以下语法创建公式来充当第二项参数：
id variables ~ variable variables
其中ID与measurement变量由波浪线隔开。如果波浪线两端的变量不止一个，那么各变量彼此之间要用“加号”分隔。
第三项参数用于让dcast()将包含measurement值的列名称分配给value.var。

第二节将介绍data.table包，这是一个功能更加强大，速度更快的数据处理包。